In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import math

In [2]:
data = pd.read_csv('HCEPDB/HCEPDB_moldata.csv')

Read the first 10 rows.


In [3]:
data.head(10)


Out[3]:
id SMILES_str stoich_str mass pce voc jsc e_homo_alpha e_gap_alpha e_lumo_alpha tmp_smiles_str
0 655365 C1C=CC=C1c1cc2[se]c3c4occc4c4nsnc4c3c2cn1 C18H9N3OSSe 394.3151 5.161953 0.867601 91.567575 -5.467601 2.022944 -3.444656 C1=CC=C(C1)c1cc2[se]c3c4occc4c4nsnc4c3c2cn1
1 1245190 C1C=CC=C1c1cc2[se]c3c(ncc4ccccc34)c2c2=C[SiH2]... C22H15NSeSi 400.4135 5.261398 0.504824 160.401549 -5.104824 1.630750 -3.474074 C1=CC=C(C1)c1cc2[se]c3c(ncc4ccccc34)c2c2=C[SiH...
2 21847 C1C=c2ccc3c4c[nH]cc4c4c5[SiH2]C(=Cc5oc4c3c2=C1... C24H17NOSi 363.4903 0.000000 0.000000 197.474780 -4.539526 1.462158 -3.077368 C1=CC=C(C1)C1=Cc2oc3c(c2[SiH2]1)c1c[nH]cc1c1cc...
3 65553 [SiH2]1C=CC2=C1C=C([SiH2]2)C1=Cc2[se]ccc2[SiH2]1 C12H12SeSi3 319.4448 6.138294 0.630274 149.887545 -5.230274 1.682250 -3.548025 C1=CC2=C([SiH2]1)C=C([SiH2]2)C1=Cc2[se]ccc2[Si...
4 720918 C1C=c2c3ccsc3c3[se]c4cc(oc4c3c2=C1)C1=CC=CC1 C20H12OSSe 379.3398 1.991366 0.242119 126.581347 -4.842119 1.809439 -3.032680 C1=CC=C(C1)c1cc2[se]c3c4sccc4c4=CCC=c4c3c2o1
5 1310744 C1C=CC=C1c1cc2[se]c3c(c4nsnc4c4ccncc34)c2c2ccc... C24H13N3SSe 454.4137 5.605135 0.951911 90.622776 -5.551911 2.029717 -3.522194 C1=CC=C(C1)c1cc2[se]c3c(c4nsnc4c4ccncc34)c2c2c...
6 196637 C1C=CC=C1c1cc2[se]c3cc4ccsc4cc3c2[se]1 C17H10SSe2 404.2520 2.644436 0.587932 69.223461 -5.187932 2.201106 -2.986827 C1=CC=C(C1)c1cc2[se]c3cc4ccsc4cc3c2[se]1
7 262174 C1C=CC=C1c1cc2[se]c3c4occc4c4cscc4c3c2[se]1 C19H10OSSe2 444.2730 2.523057 0.397670 97.645325 -4.997670 1.982122 -3.015548 C1=CC=C(C1)c1cc2[se]c3c4occc4c4cscc4c3c2[se]1
8 393249 C1C=CC=C1c1cc2[se]c3cc4cccnc4cc3c2c2ccccc12 C24H15NSe 396.3495 3.115895 0.869140 55.174815 -5.469140 2.331815 -3.137325 C1=CC=C(C1)c1cc2[se]c3cc4cccnc4cc3c2c2ccccc12
9 35 C1C2=C([SiH2]C=C2)C=C1c1cc2occc2c2cscc12 C17H12OSSi 292.4328 2.743214 0.387106 109.062905 -4.987106 1.909966 -3.077141 C1=CC2=C([SiH2]1)C=C(C2)c1cc2occc2c2cscc12

Read the last 10 rows.


In [4]:
data.tail(10)


Out[4]:
id SMILES_str stoich_str mass pce voc jsc e_homo_alpha e_gap_alpha e_lumo_alpha tmp_smiles_str
2322839 1703911 C1cc2c(ccc(-c3cccnc3)c2c1)-c1sc(-c2scc3cc[SiH2... C26H19NS2Si2 465.7471 4.881051 0.657693 114.218791 -5.257693 1.876279 -3.381414 c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
2322840 1814506 [SiH2]1ccc2csc(c12)-c1sc(-c2sc(c3[SiH2]ccc23)-... C23H16N2S3Si2 472.7634 3.353182 0.461167 111.904241 -5.061167 1.892000 -3.169167 c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(c3[SiH2]ccc23)-...
2322841 2559314 [SiH2]1ccc2csc(c12)-c1sc(-c2sc(c3[SiH2]ccc23)-... C23H15NOS3Si2 473.7475 4.263377 0.688326 95.325067 -5.288326 1.998713 -3.289613 c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(c3[SiH2]ccc23)-...
2322842 2351086 [SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1ccc(... C24H16N2S3Si2 484.7744 6.662663 0.850060 120.627407 -5.450060 1.839686 -3.610374 c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
2322843 1712111 [SiH2]1ccc2csc(c12)-c1sc(-c2sc(-c3scc4ccsc34)c... C24H12OS6Si 536.8398 2.951709 0.279912 162.292795 -4.879912 1.615145 -3.264767 c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(-c3scc4ccsc34)c...
2322844 2543603 [SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1cnc(... C22H14N4S3Si2 486.7506 0.000000 0.000000 0.000000 -5.632512 1.454082 -4.178430 c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1cnc(...
2322845 2304057 [SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1ccc(... C22H14N4S3Si2 486.7506 9.335485 1.120744 128.197094 -5.720744 1.798600 -3.922144 c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
2322846 2007035 [SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1ccc(... C26H18S3Si2 482.7982 2.498209 0.834995 46.046052 -5.434995 2.433160 -3.001835 c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(...
2322847 1961981 C1ccc2c1c(sc2-c1scc2cc[SiH2]c12)-c1ccc(cc1)-c1... C25H16S3SeSi 519.6454 2.679067 0.659243 62.544032 -5.259243 2.258468 -3.000775 c1sc(c2[SiH2]ccc12)-c1sc(c2Cccc12)-c1ccc(cc1)-...
2322848 2754558 [SiH2]1ccc2csc(c12)-c1sc(-c2sc(-c3scc4ccsc34)c... C24H13NOS5Si 519.7887 1.272400 0.102802 190.489616 -4.702802 1.490950 -3.211851 c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(-c3scc4ccsc34)c...

Show the number of elements.


In [5]:
data.shape


Out[5]:
(2322849, 11)

Show the column names.


In [6]:
data.columns


Out[6]:
Index(['id', 'SMILES_str', 'stoich_str', 'mass', 'pce', 'voc', 'jsc',
       'e_homo_alpha', 'e_gap_alpha', 'e_lumo_alpha', 'tmp_smiles_str'],
      dtype='object')

Make id column the index.


In [7]:
data.set_index('id').head(5)


Out[7]:
SMILES_str stoich_str mass pce voc jsc e_homo_alpha e_gap_alpha e_lumo_alpha tmp_smiles_str
id
655365 C1C=CC=C1c1cc2[se]c3c4occc4c4nsnc4c3c2cn1 C18H9N3OSSe 394.3151 5.161953 0.867601 91.567575 -5.467601 2.022944 -3.444656 C1=CC=C(C1)c1cc2[se]c3c4occc4c4nsnc4c3c2cn1
1245190 C1C=CC=C1c1cc2[se]c3c(ncc4ccccc34)c2c2=C[SiH2]... C22H15NSeSi 400.4135 5.261398 0.504824 160.401549 -5.104824 1.630750 -3.474074 C1=CC=C(C1)c1cc2[se]c3c(ncc4ccccc34)c2c2=C[SiH...
21847 C1C=c2ccc3c4c[nH]cc4c4c5[SiH2]C(=Cc5oc4c3c2=C1... C24H17NOSi 363.4903 0.000000 0.000000 197.474780 -4.539526 1.462158 -3.077368 C1=CC=C(C1)C1=Cc2oc3c(c2[SiH2]1)c1c[nH]cc1c1cc...
65553 [SiH2]1C=CC2=C1C=C([SiH2]2)C1=Cc2[se]ccc2[SiH2]1 C12H12SeSi3 319.4448 6.138294 0.630274 149.887545 -5.230274 1.682250 -3.548025 C1=CC2=C([SiH2]1)C=C([SiH2]2)C1=Cc2[se]ccc2[Si...
720918 C1C=c2c3ccsc3c3[se]c4cc(oc4c3c2=C1)C1=CC=CC1 C20H12OSSe 379.3398 1.991366 0.242119 126.581347 -4.842119 1.809439 -3.032680 C1=CC=C(C1)c1cc2[se]c3c4sccc4c4=CCC=c4c3c2o1

Show the data types of each column.


In [8]:
data.dtypes


Out[8]:
id                  int64
SMILES_str         object
stoich_str         object
mass              float64
pce               float64
voc               float64
jsc               float64
e_homo_alpha      float64
e_gap_alpha       float64
e_lumo_alpha      float64
tmp_smiles_str     object
dtype: object

Start to manage data.

Now I try to calculate the standard diviation of the mass.


In [9]:
data['(xi-x)^2'] = (data['mass'] - data['mass'].mean())**2
data.head()


Out[9]:
id SMILES_str stoich_str mass pce voc jsc e_homo_alpha e_gap_alpha e_lumo_alpha tmp_smiles_str (xi-x)^2
0 655365 C1C=CC=C1c1cc2[se]c3c4occc4c4nsnc4c3c2cn1 C18H9N3OSSe 394.3151 5.161953 0.867601 91.567575 -5.467601 2.022944 -3.444656 C1=CC=C(C1)c1cc2[se]c3c4occc4c4nsnc4c3c2cn1 451.517873
1 1245190 C1C=CC=C1c1cc2[se]c3c(ncc4ccccc34)c2c2=C[SiH2]... C22H15NSeSi 400.4135 5.261398 0.504824 160.401549 -5.104824 1.630750 -3.474074 C1=CC=C(C1)c1cc2[se]c3c(ncc4ccccc34)c2c2=C[SiH... 229.539163
2 21847 C1C=c2ccc3c4c[nH]cc4c4c5[SiH2]C(=Cc5oc4c3c2=C1... C24H17NOSi 363.4903 0.000000 0.000000 197.474780 -4.539526 1.462158 -3.077368 C1=CC=C(C1)C1=Cc2oc3c(c2[SiH2]1)c1c[nH]cc1c1cc... 2711.675432
3 65553 [SiH2]1C=CC2=C1C=C([SiH2]2)C1=Cc2[se]ccc2[SiH2]1 C12H12SeSi3 319.4448 6.138294 0.630274 149.887545 -5.230274 1.682250 -3.548025 C1=CC2=C([SiH2]1)C=C([SiH2]2)C1=Cc2[se]ccc2[Si... 9238.910207
4 720918 C1C=c2c3ccsc3c3[se]c4cc(oc4c3c2=C1)C1=CC=CC1 C20H12OSSe 379.3398 1.991366 0.242119 126.581347 -4.842119 1.809439 -3.032680 C1=CC=C(C1)c1cc2[se]c3c4sccc4c4=CCC=c4c3c2o1 1312.196283

In [39]:
SD = math.sqrt(sum(data['(xi-x)^2'])/data.shape[0])
M = data['mass'].mean()
print('SD = ',SD,', mean = ',M)


SD =  59.853157780691824 , mean =  415.564049928

Group the data by the standard diviation of the mass.


In [55]:
data['mass_group'] = pd.cut(data['mass'],bins=[min(data['mass']),M-3*SD,M-2*SD,M-SD,M+SD,M+2*SD,M+3*SD,max(data['mass'])],labels=["<(-3SD)","-3SD~-2SD","-2SD~-SD","-SD~+SD","+SD~+2SD","+2SD~+3SD",">(+3SD)"])

In [56]:
data


Out[56]:
id SMILES_str stoich_str mass pce voc jsc e_homo_alpha e_gap_alpha e_lumo_alpha tmp_smiles_str (xi-x)^2 mass_group
0 655365 C1C=CC=C1c1cc2[se]c3c4occc4c4nsnc4c3c2cn1 C18H9N3OSSe 394.3151 5.161953 0.867601 91.567575 -5.467601 2.022944 -3.444656 C1=CC=C(C1)c1cc2[se]c3c4occc4c4nsnc4c3c2cn1 451.517873 -SD~+SD
1 1245190 C1C=CC=C1c1cc2[se]c3c(ncc4ccccc34)c2c2=C[SiH2]... C22H15NSeSi 400.4135 5.261398 0.504824 160.401549 -5.104824 1.630750 -3.474074 C1=CC=C(C1)c1cc2[se]c3c(ncc4ccccc34)c2c2=C[SiH... 229.539163 -SD~+SD
2 21847 C1C=c2ccc3c4c[nH]cc4c4c5[SiH2]C(=Cc5oc4c3c2=C1... C24H17NOSi 363.4903 0.000000 0.000000 197.474780 -4.539526 1.462158 -3.077368 C1=CC=C(C1)C1=Cc2oc3c(c2[SiH2]1)c1c[nH]cc1c1cc... 2711.675432 -SD~+SD
3 65553 [SiH2]1C=CC2=C1C=C([SiH2]2)C1=Cc2[se]ccc2[SiH2]1 C12H12SeSi3 319.4448 6.138294 0.630274 149.887545 -5.230274 1.682250 -3.548025 C1=CC2=C([SiH2]1)C=C([SiH2]2)C1=Cc2[se]ccc2[Si... 9238.910207 -2SD~-SD
4 720918 C1C=c2c3ccsc3c3[se]c4cc(oc4c3c2=C1)C1=CC=CC1 C20H12OSSe 379.3398 1.991366 0.242119 126.581347 -4.842119 1.809439 -3.032680 C1=CC=C(C1)c1cc2[se]c3c4sccc4c4=CCC=c4c3c2o1 1312.196283 -SD~+SD
5 1310744 C1C=CC=C1c1cc2[se]c3c(c4nsnc4c4ccncc34)c2c2ccc... C24H13N3SSe 454.4137 5.605135 0.951911 90.622776 -5.551911 2.029717 -3.522194 C1=CC=C(C1)c1cc2[se]c3c(c4nsnc4c4ccncc34)c2c2c... 1509.295311 -SD~+SD
6 196637 C1C=CC=C1c1cc2[se]c3cc4ccsc4cc3c2[se]1 C17H10SSe2 404.2520 2.644436 0.587932 69.223461 -5.187932 2.201106 -2.986827 C1=CC=C(C1)c1cc2[se]c3cc4ccsc4cc3c2[se]1 127.962474 -SD~+SD
7 262174 C1C=CC=C1c1cc2[se]c3c4occc4c4cscc4c3c2[se]1 C19H10OSSe2 444.2730 2.523057 0.397670 97.645325 -4.997670 1.982122 -3.015548 C1=CC=C(C1)c1cc2[se]c3c4occc4c4cscc4c3c2[se]1 824.203814 -SD~+SD
8 393249 C1C=CC=C1c1cc2[se]c3cc4cccnc4cc3c2c2ccccc12 C24H15NSe 396.3495 3.115895 0.869140 55.174815 -5.469140 2.331815 -3.137325 C1=CC=C(C1)c1cc2[se]c3cc4cccnc4cc3c2c2ccccc12 369.198929 -SD~+SD
9 35 C1C2=C([SiH2]C=C2)C=C1c1cc2occc2c2cscc12 C17H12OSSi 292.4328 2.743214 0.387106 109.062905 -4.987106 1.909966 -3.077141 C1=CC2=C([SiH2]1)C=C(C2)c1cc2occc2c2cscc12 15161.304709 -3SD~-2SD
10 1048612 C1C=CC=C1C1=Cc2sc3cc4C=C[SiH2]c4cc3c2C1 C18H14SSi 290.4606 2.408411 0.431315 85.937708 -5.031315 2.065850 -2.965465 C1=CC=C(C1)C1=Cc2sc3cc4C=C[SiH2]c4cc3c2C1 15650.873184 -3SD~-2SD
11 917542 C1C=c2ccc3[se]c4c5[se]c(cc5[se]c4c3c2=C1)C1=CC... C20H12Se3 489.1948 2.843278 0.302591 144.614366 -4.902591 1.708198 -3.194393 C1=CC=C(C1)c1cc2[se]c3c([se]c4ccc5=CCC=c5c34)c... 5421.487356 +SD~+2SD
12 1441831 C1C=CC=C1C1=Cc2ncc3c4[se]ccc4cnc3c2C1 C18H12N2Se 335.2668 2.687240 0.675497 61.225278 -5.275497 2.270953 -3.004544 C1=CC=C(C1)C1=Cc2ncc3c4[se]ccc4cnc3c2C1 6447.648346 -2SD~-SD
13 1376296 C1C=CC=C1C1=Cc2c(C1)c1[se]c3ccc4cscc4c3c1c1=C[... C24H16SSeSi 443.5024 2.844637 0.189206 231.387394 -4.789206 1.312334 -3.476872 C1=CC=C(C1)C1=Cc2c(C1)c1[se]c3ccc4cscc4c3c1c1=... 780.551405 -SD~+SD
14 1638442 C1C=c2ccc3cnc4c5[SiH2]C(=Cc5c5nsnc5c4c3c2=C1)C... C23H15N3SSi 393.5445 6.462512 0.602405 165.105179 -5.202405 1.603165 -3.599240 C1=CC=C(C1)C1=Cc2c([SiH2]1)c1ncc3ccc4=CCC=c4c3... 484.860579 -SD~+SD
15 98350 C1C=CC=C1C1=Cc2ccc3c4CC=Cc4c4cscc4c3c2[SiH2]1 C22H16SSi 340.5204 2.631463 0.410851 98.573546 -5.010851 1.975707 -3.035144 C1=CC=C(C1)C1=Cc2ccc3c4CC=Cc4c4cscc4c3c2[SiH2]1 5631.549395 -2SD~-SD
16 2162747 C1C=CC=C1C1=Cc2c([SiH2]1)c1c3c[nH]cc3c3ccc4=C[... C27H19NOSi2 429.6251 2.039158 0.140744 222.981280 -4.740744 1.361137 -3.379607 C1=CC=C(C1)C1=Cc2c([SiH2]1)c1c3c[nH]cc3c3ccc4=... 197.713129 -SD~+SD
17 557119 C1C=c2c3C=C(Cc3c3occc3c2=C1)C1=CC=CC1 C19H14O 258.3186 0.237205 0.024962 146.246545 -4.624962 1.700415 -2.924547 C1=CC=C(C1)C1=Cc2c(C1)c1occc1c1=CCC=c21 24726.131523 -3SD~-2SD
18 753728 C1C=CC=C1C1=Cc2c([SiH2]1)c1cc3ncccc3cc1c1c[nH]... C22H16N2Si 336.4684 3.103831 0.409504 116.650708 -5.009504 1.863416 -3.146088 C1=CC=C(C1)C1=Cc2c([SiH2]1)c1cc3ncccc3cc1c1c[n... 6256.121838 -2SD~-SD
19 819265 C1C=CC=C1C1=Cc2c([SiH2]1)c1c(c3cscc23)c2[se]cc... C23H16SSeSi2 459.5774 5.385253 0.368606 224.848916 -4.968606 1.352309 -3.616298 C1=CC=C(C1)C1=Cc2c([SiH2]1)c1c(c3cscc23)c2[se]... 1937.174985 -SD~+SD
20 1278019 C1C=CC=C1C1=Cc2c([SiH2]1)c1c(c3[SiH2]C=Cc3c3=C... C23H18OSi3 394.6522 5.489489 0.301242 280.455932 -4.901242 1.135619 -3.765623 C1=CC=C(C1)C1=Cc2c([SiH2]1)c1c(c3[SiH2]C=Cc3c3... 437.305467 -SD~+SD
21 2096063 C1C=CC=C1c1cc2[se]c3c(c2c2cscc12)c1ccccc1c1ccc... C27H14N2S2Se 509.5136 6.204093 0.570055 167.497914 -5.170055 1.593078 -3.576977 C1=CC=C(C1)c1cc2[se]c3c(c2c2cscc12)c1ccccc1c1c... 8826.517959 +SD~+2SD
22 2752585 C1C=CC=C1C1=Cc2c(C1)c1c(c3c[nH]cc23)c2c3c[nH]c... C28H20N2Si 412.5660 0.000000 0.000000 198.749914 -4.499447 1.457208 -3.042239 C1=CC=C(C1)C1=Cc2c(C1)c1c(c3c[nH]cc23)c2c3c[nH... 8.988303 -SD~+SD
23 1572945 C1C=CC=C1C1=Cc2[se]c3c4sccc4c4ccccc4c3c2C1 C22H14SSe 389.3786 2.167252 0.330623 100.884304 -4.930623 1.961253 -2.969370 C1=CC=C(C1)C1=Cc2[se]c3c4sccc4c4ccccc4c3c2C1 685.677788 -SD~+SD
24 2359381 C1C=CC=C1C1=Cc2c(C1)c1c3cscc3c3ccc4nsnc4c3c1c1... C26H14N2OS2 434.5416 4.112982 0.299549 211.318161 -4.899549 1.409229 -3.490319 C1=CC=C(C1)C1=Cc2c(C1)c1c3cscc3c3ccc4nsnc4c3c1... 360.147407 -SD~+SD
25 1540183 C1C=CC=C1c1cc2[se]c3c([se]c4ccc5cscc5c34)c2cn1 C20H11NSSe2 455.2999 3.212565 0.683568 72.329945 -5.283568 2.174712 -3.108856 C1=CC=C(C1)c1cc2[se]c3c([se]c4ccc5cscc5c34)c2cn1 1578.937781 -SD~+SD
26 1638500 C1C=CC=C1c1cc2[se]c3ccc4ccccc4c3c2c2cocc12 C23H14OSe 385.3226 3.088844 0.482262 98.573546 -5.082262 1.977235 -3.105027 C1=CC=C(C1)c1cc2[se]c3ccc4ccccc4c3c2c2cocc12 914.545294 -SD~+SD
27 2621542 C1C=c2c3ccccc3c3c4ccccc4c4C=C(Cc4c3c2=C1)C1=CC... C29H20 368.4770 2.552886 0.341115 115.180406 -4.941115 1.872759 -3.068355 C1=CC=C(C1)C1=Cc2c(C1)c1c(c3ccccc23)c2ccccc2c2... 2217.190271 -SD~+SD
28 98411 C1C=CC=C1c1cc2[se]c3cc4cccnc4cc3c2c2cscc12 C22H13NSSe 402.3777 4.247356 0.653960 99.957476 -5.253960 1.967245 -3.286715 C1=CC=C(C1)c1cc2[se]c3cc4cccnc4cc3c2c2cscc12 173.879824 -SD~+SD
29 524398 C1C=c2c3C=C([SiH2]c3c3ncc4ccc5nsnc5c4c3c2=C1)C... C23H15N3SSi 393.5445 5.860942 0.497394 181.348711 -5.097394 1.533947 -3.563447 C1=CC=C(C1)C1=Cc2c([SiH2]1)c1ncc3ccc4nsnc4c3c1... 484.860579 -SD~+SD
... ... ... ... ... ... ... ... ... ... ... ... ... ...
2322819 2705444 [SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1ccc(... C25H17NS3Si2 483.7863 2.976815 0.892533 51.330433 -5.492533 2.373489 -3.119045 c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(... 4654.275405 +SD~+2SD
2322820 2925216 [SiH2]1ccc2csc(c12)-c1sc(-c2sc(-c3scc4occc34)c... C24H12O2S5Si 520.7728 3.687312 0.323482 175.431612 -4.923482 1.558371 -3.365111 c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(-c3scc4occc34)c... 11068.881092 +SD~+2SD
2322821 2742210 [SiH2]1ccc2csc(c12)-c1sc(-c2sc(-c3scc4ccoc34)c... C24H12O2S5Si 520.7728 3.036407 0.280599 166.541080 -4.880599 1.596420 -3.284179 c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(-c3scc4ccoc34)c... 11068.881092 +SD~+2SD
2322822 3092419 [SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1ccc(... C23H15N3S3Si2 485.7625 5.766431 1.000112 88.737230 -5.600112 2.045365 -3.554748 c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(... 4927.822392 +SD~+2SD
2322823 1253317 [SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1ccc(... C23H17NS2Si2 427.6983 2.569183 1.021842 38.695335 -5.621842 2.523390 -3.098452 c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(... 147.240025 -SD~+SD
2322824 1841096 [SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1ccc(... C25H17NOS2Si2 467.7193 3.651471 0.838712 67.004278 -5.438712 2.220525 -3.218187 c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(... 2720.170110 -SD~+SD
2322825 2770889 C1ccc2c1c(sc2-c1scc2cc[SiH2]c12)-c1ccc(-c2cccc... C26H17NS3Si 467.7113 3.294399 0.667854 75.917576 -5.267854 2.143414 -3.124440 c1sc(c2[SiH2]ccc12)-c1sc(c2Cccc12)-c1ccc(-c2cc... 2719.335690 -SD~+SD
2322826 1816522 C1ccc2c1c(sc2-c1scc2cc[SiH2]c12)-c1sc(-c2ccccc... C25H16S4Si 472.7514 3.297434 0.473489 107.179926 -5.073489 1.921142 -3.152347 c1sc(c2[SiH2]ccc12)-c1sc(c2Cccc12)-c1sc(-c2ccc... 3270.393008 -SD~+SD
2322827 1810382 [SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1ccc(... C25H17NOS2Si2 467.7193 3.581623 0.762095 72.329945 -5.362095 2.171842 -3.190253 c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(... 2720.170110 -SD~+SD
2322828 1648591 [SiH2]1ccc2csc(c12)-c1sc(-c2sc(-c3scc4ccoc34)c... C24H12O3S4Si 504.7058 2.780562 0.264955 161.513282 -4.864955 1.618879 -3.246076 c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(-c3scc4ccoc34)c... 7946.251606 +SD~+2SD
2322829 2705360 [SiH2]1ccc2csc(c12)-c1sc(-c2sc(-c3scc4ccoc34)c... C24H13NO2S4Si 503.7217 1.063303 0.087194 187.679800 -4.687194 1.502985 -3.184210 c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(-c3scc4ccoc34)c... 7771.771266 +SD~+2SD
2322830 2349009 C1ccc2csc(c12)-c1ccc(cn1)-c1sc(-c2scc3cc[SiH2]... C24H17NS3Si2 471.7753 2.802896 0.911719 47.314404 -5.511719 2.421182 -3.090538 c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(... 3159.704635 -SD~+SD
2322831 3091107 [SiH2]1ccc2csc(c12)-c1sc(-c2sc(-c3scc4ccsc34)c... C24H14OS5Si2 534.8756 3.770352 0.412894 140.537136 -5.012894 1.732059 -3.280835 c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(-c3scc4ccsc34)c... 14235.245980 +SD~+2SD
2322832 8152 [SiH2]1ccc2csc(c12)-c1sc(-c2scc3cc[se]c23)c2[s... C18H10S3Se2Si 508.4810 2.887419 0.549016 80.941730 -5.149016 2.101911 -3.047105 c1sc(c2[SiH2]ccc12)-c1sc(-c2scc3cc[se]c23)c2[s... 8633.559611 +SD~+2SD
2322833 1781722 [SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1ccc(... C23H16N2S3Si2 472.7634 2.814019 0.556938 77.762059 -5.156938 2.127099 -3.029839 c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(... 3271.765649 -SD~+SD
2322834 2470223 [SiH2]1ccc2csc(c12)-c1sc(-c2sc(-c3scc4sccc34)c... C24H13NS6Si 535.8557 2.445740 0.207560 181.348711 -4.807560 1.533100 -3.274460 c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(-c3scc4sccc34)c... 14470.081077 +2SD~+3SD
2322835 2469856 C1ccc2c1c(sc2-c1sc(-c2scc3cc[SiH2]c23)c2ccoc12... C25H15NOS4Si 501.7495 2.143418 0.227460 145.026911 -4.827460 1.707258 -3.120202 c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(c3Cccc23)-c2scc... 7427.931804 +SD~+2SD
2322836 1912803 [SiH2]1ccc2csc(c12)-c1sc(-c2sc(-c3scc4ccoc34)c... C24H12O3S4Si 504.7058 2.656897 0.274521 148.952385 -4.874521 1.686757 -3.187764 c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(-c3scc4ccoc34)c... 7946.251606 +SD~+2SD
2322837 1216485 [SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1cccc... C18H12N2S3Si2 408.6768 7.594213 0.993521 117.639554 -5.593521 1.857476 -3.736045 c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1cccc... 47.434212 -SD~+SD
2322838 2619366 C1cc2c(ccc(-c3ccccc3)c2c1)-c1sc(-c2scc3cc[SiH2... C28H20S2Si 448.6840 3.743223 0.466049 123.612430 -5.066049 1.824004 -3.242045 c1sc(c2[SiH2]ccc12)-c1sc(c2Cccc12)-c1ccc(-c2cc... 1096.931093 -SD~+SD
2322839 1703911 C1cc2c(ccc(-c3cccnc3)c2c1)-c1sc(-c2scc3cc[SiH2... C26H19NS2Si2 465.7471 4.881051 0.657693 114.218791 -5.257693 1.876279 -3.381414 c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(... 2518.338514 -SD~+SD
2322840 1814506 [SiH2]1ccc2csc(c12)-c1sc(-c2sc(c3[SiH2]ccc23)-... C23H16N2S3Si2 472.7634 3.353182 0.461167 111.904241 -5.061167 1.892000 -3.169167 c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(c3[SiH2]ccc23)-... 3271.765649 -SD~+SD
2322841 2559314 [SiH2]1ccc2csc(c12)-c1sc(-c2sc(c3[SiH2]ccc23)-... C23H15NOS3Si2 473.7475 4.263377 0.688326 95.325067 -5.288326 1.998713 -3.289613 c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(c3[SiH2]ccc23)-... 3385.313862 -SD~+SD
2322842 2351086 [SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1ccc(... C24H16N2S3Si2 484.7744 6.662663 0.850060 120.627407 -5.450060 1.839686 -3.610374 c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(... 4790.072557 +SD~+2SD
2322843 1712111 [SiH2]1ccc2csc(c12)-c1sc(-c2sc(-c3scc4ccsc34)c... C24H12OS6Si 536.8398 2.951709 0.279912 162.292795 -4.879912 1.615145 -3.264767 c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(-c3scc4ccsc34)c... 14707.807555 +2SD~+3SD
2322844 2543603 [SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1cnc(... C22H14N4S3Si2 486.7506 0.000000 0.000000 0.000000 -5.632512 1.454082 -4.178430 c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1cnc(... 5067.524911 +SD~+2SD
2322845 2304057 [SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1ccc(... C22H14N4S3Si2 486.7506 9.335485 1.120744 128.197094 -5.720744 1.798600 -3.922144 c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(... 5067.524911 +SD~+2SD
2322846 2007035 [SiH2]1ccc2csc(c12)-c1sc(c2[SiH2]ccc12)-c1ccc(... C26H18S3Si2 482.7982 2.498209 0.834995 46.046052 -5.434995 2.433160 -3.001835 c1sc(c2[SiH2]ccc12)-c1sc(c2[SiH2]ccc12)-c1ccc(... 4520.430936 +SD~+2SD
2322847 1961981 C1ccc2c1c(sc2-c1scc2cc[SiH2]c12)-c1ccc(cc1)-c1... C25H16S3SeSi 519.6454 2.679067 0.659243 62.544032 -5.259243 2.258468 -3.000775 c1sc(c2[SiH2]ccc12)-c1sc(c2Cccc12)-c1ccc(cc1)-... 10832.927433 +SD~+2SD
2322848 2754558 [SiH2]1ccc2csc(c12)-c1sc(-c2sc(-c3scc4ccsc34)c... C24H13NOS5Si 519.7887 1.272400 0.102802 190.489616 -4.702802 1.490950 -3.211851 c1sc(c2[SiH2]ccc12)-c1sc(-c2sc(-c3scc4ccsc34)c... 10862.777683 +SD~+2SD

2322849 rows × 13 columns


In [57]:
pd.value_counts(data['mass_group'])


Out[57]:
-SD~+SD      1603364
-2SD~-SD      323872
+SD~+2SD      290594
-3SD~-2SD      66373
+2SD~+3SD      34290
<(-3SD)         3144
>(+3SD)         1211
Name: mass_group, dtype: int64

Now I calculate how many data are wihtin 1 standard diviation, 2 standard diviatioa, and 3 standard diviation respectively.


In [58]:
1603364/data.shape[0]


Out[58]:
0.690257524272994

In [59]:
(1603364+323872+290594)/data.shape[0]


Out[59]:
0.9547887098989215

In [60]:
(1603364+323872+290594+66373+34290)/data.shape[0]


Out[60]:
0.998124716673361

Though I didn't know what these data are, I found the mass of these chemicals are nearly normal distribution!


In [61]:
data.groupby(['mass_group'])['voc'].describe()


Out[61]:
mass_group       
<(-3SD)     count    3.144000e+03
            mean     6.219819e-01
            std      4.210254e-01
            min      0.000000e+00
            25%      2.923413e-01
            50%      5.870970e-01
            75%      8.999475e-01
            max      2.178957e+00
-3SD~-2SD   count    6.637300e+04
            mean     6.665345e-01
            std      3.581035e-01
            min      0.000000e+00
            25%      4.086398e-01
            50%      6.538720e-01
            75%      9.035142e-01
            max      2.135583e+00
-2SD~-SD    count    3.238720e+05
            mean     6.320939e-01
            std      3.352676e-01
            min      0.000000e+00
            25%      3.982995e-01
            50%      6.221896e-01
            75%      8.524997e-01
            max      2.103816e+00
-SD~+SD     count    1.603364e+06
            mean     5.465694e-01
            std      2.958476e-01
            min      0.000000e+00
            25%      3.423544e-01
            50%      5.364526e-01
            75%      7.420294e-01
            max      1.991945e+00
+SD~+2SD    count    2.905940e+05
            mean     4.830717e-01
            std      2.730553e-01
            min      0.000000e+00
            25%      2.971842e-01
            50%      4.845037e-01
            75%      6.709221e-01
            max      1.713406e+00
+2SD~+3SD   count    3.429000e+04
            mean     4.177375e-01
            std      2.301736e-01
            min      0.000000e+00
            25%      2.761383e-01
            50%      4.348518e-01
            75%      5.779685e-01
            max      1.274693e+00
>(+3SD)     count    1.211000e+03
            mean     3.944460e-01
            std      1.581920e-01
            min      0.000000e+00
            25%      3.047716e-01
            50%      3.960159e-01
            75%      4.899024e-01
            max      8.624765e-01
Name: voc, dtype: float64

Get a plot voc v.s. grouped data


In [62]:
data.groupby(['mass_group'])['voc'].mean().plot()


Out[62]:
<matplotlib.axes._subplots.AxesSubplot at 0x106070208>

Get another plot.


In [72]:
data.groupby(['mass_group'])['tmp_smiles_str'].count().plot().hist(20)


Out[72]:
(array([ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.]),
 array([ 19.5,  19.6,  19.7,  19.8,  19.9,  20. ,  20.1,  20.2,  20.3,
         20.4,  20.5]),
 <a list of 10 Patch objects>)

I found the distribution of the amount of 'tmp_smiles_str' is as same as what I got from doing 'mass', nearly normal distribution.


In [ ]: